Description

This document describes the steps, code, and files used to generate the ~50kb TwinStrand sequencing panel to study the genetics of male infertility. Specifically, we are targeting DNA damage repair genes, known cancer genes, AZF genes, and genes involved in clonal spermatogenesis.

Pre-Processing of Necessary Files

Target Gene List

This is the script used to generate a targeted sequencing panel for the spermseq male infertility project. )

Step 1: Copy the gene list from the spreadsheet and save as a 1-column text file (target_genes.txt) Step 2: Run below script to save into kingspeak UNIX environment

## Change working directory 
cd ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set
## Sort and unique 
cat target_genes.txt | sort | uniq > target_genes_v1.txt
## Copy to kingspeak directory
scp target_genes_v1.txt u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets

GRCh38 GTF File

Use the build 38 GTF file to get other identifying information for each gene_name of interest. Look only at genes/regions that are coding sequences (hence grep -w “CDS” in the below command…)

## Define directory
cd ~/spermseq/twinstrand_target

## Download GTF file 
wget ftp://ftp.ensembl.org/pub/release-102/gtf/homo_sapiens/Homo_sapiens.GRCh38.102.gtf.gz | zless | grep -w "CDS" > Homo_sapiens.GRCh38.102.CDS.gtf
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets/Homo_sapiens.GRCh38.102.CDS.gtf ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data

## Get the gene_ids of the target genes of interest
cat target_genes_v1.txt | awk '{print "cat Homo_sapiens.GRCh38.102.CDS.gtf | grep \"gene_name \\\""$1"\\\"\""}' | bash | cut -f 9 | awk 'BEGIN{FS = "\"; "} ; {print $0}' | sed 's/\"; /|/g' | sed 's/;//g' | sed 's/ \"/=/g' > target_gene_list_gtf_identifiers.txt

## Copy target_genes_gid_tid_pid.txt file to local 
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets/target_gene_list_gtf_identifiers.txt ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set

Use R script to get gene identifier information from the GTF file

Use R to obtain key GTF fields for each gene: gene_name, gene_id, transcript_id, protein_id, and exon_number.

## Read in the file 
df <- read.table("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/target_gene_list_gtf_identifiers.txt")

## Go through each row and get the gene_id, gene_name, transcript_id, protein_id, exon_number, and exon_id
x <- df[1,]
ids <- rbindlist(apply(df, 1, function(x) {
  fields <- (strsplit(x, split = "|", fixed = TRUE))[[1]]
  fields <- fields[grep(paste(c("gene_name", "gene_id", "transcript_id", "protein_id", "exon_number"), collapse = "|"), fields)] %>% 
    strsplit(., split = "=", fixed = TRUE) %>% unlist()
  
  return(data.table("gene_name"=fields[8],
                    "gene_id"=fields[2],
                    "transcript_id"=fields[4],
                    "protein_id"=fields[10],
                    "exon_number"=fields[6]))
}))
write.table(x = ids, file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/target_gene_list_ids.txt", quote = FALSE, sep = "\t", row.names = FALSE)

GTEx gene-level expression data

Obtain GTEx TPM data using gene_ids belonging to the genes of interest.

## Copy the gene list ids file to kingspeak 
scp ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/target_gene_list_ids.txt u1240855@kingspeak.chpc.utah.edu:~/spermseq/twinstrand_targets

## Download master sample list to identify testis samples
cd ~/spermseq/data/GTEx/samples
wget https://storage.googleapis.com/gtex_analysis_v8/annotations/GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt
grep -w -i "testis" GTEx_Analysis_v8_Annotations_SampleAttributesDS.txt | awk '{print $1}' > GTEx_Analysis_v8_Annotations_SampleAttributesDS_TestisOnlySamples_Data.txt

## Download master transcript TPM data
cd ~/spermseq/data/GTEx/transcripts
wget https://storage.googleapis.com/gtex_analysis_v8/rna_seq_data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz 
gunzip GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct.gz

## Obtain transcript TPM data from testis samples --> see "Get TPM data from GTEx testis samples"
cd ~/spermseq/data/GTEx/transcripts
head -n 1 GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct > header.temp ## Get the header which contains sample ids 
## Run this if the file already exists: rm ~/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct
cat ~/spermseq/twinstrand_targets/target_gene_list_ids.txt | awk -v OFS='\t' '{print $1, $2}' | sort | uniq | grep -v "gene_name" | cut -f 2 | awk '{print "cat $HOME/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_transcript_tpm.gct | grep "$1" "}' | bash >> ~/spermseq/data/GTEx/transcripts/testis.data.temp
cat header.temp testis.data.temp > GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct
## Remove files 
rm header.temp
rm testis.data.temp

## Copy final file to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/GTEx/transcripts/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data

APPRIS isoform data

## Download appris principal isoform data
cd ~/spermseq/data/appris
wget http://apprisws.bioinfo.cnio.es/pub/current_release/datafiles/homo_sapiens/GRCh38/appris_data.principal.txt

## Copy to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/appris/appris_data.principal.txt ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data

General analysis workflow

[insert image]

R scripts:

Set up R environment

knitr::opts_chunk$set(echo=TRUE)
######################################
##### Load in necessary packages #####
######################################
library(data.table)
library(dplyr)
library(ggplot2)
library(beeswarm)
library(ggbeeswarm)
library(reshape)
library(ggpubr)
library(biomaRt)
library(ensembldb)
library(EnsDb.Hsapiens.v86)

##################################
##### Specify file locations #####
##################################
appris_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/appris_data.principal.txt"
gtf_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/Homo_sapiens.GRCh38.102.CDS.gtf"
GTEx_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/GTEx_Analysis_2017-06-05_v8_RSEMv1.3.0_TESTIS_transcript_tpm.gct"
GTEx_samples <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/GTEx_testis_samples.txt"

#########################################
##### Specify the genes of interest #####
#########################################
genes <- c("WT1", "CDC14A", "DMRT1", "KLHL10", "M1AP", "MEI1", "STAG3", 
           "SYCP2", "SYCP3", "TEX11", "USP26", "PKD1", "AR", "AKT3", "APOA1", 
           "APC", "ATM", "BRAF", "BRCA1", "BRCA2", "CBL", "CDKN2A", "CTNNB1", "DICER1", 
           "DNMT3A", "ERCC1", "ESR1", "FANCM", "FGFR2", "FGFR3", "H3-3A", "H3-3B", "HRAS", 
           "KIT", "KRAS", "LIG4", "MAP2K1", "MAP2K2", "MLH1", "MLH3", "MSH5", "NR5A1", "NF1", 
           "PMS2", "PPM1D", "PRKACA", "PTCH1", "PTPN11", "RAG1", "RAF1", "RB1", "RET", "SMAD4", 
           "SOS1", "STAT3", "STK11", "TEX14", "TEX15", "TSHR", "VHL", "XPA", "XRCC1")
moderate_genes <- c("USP26", "TEX14", "SYCP2", "STAG3", "PKD1", "M1AP", "KLHL10", "DMRT1", "CDC14A")
moderate_gene_ids <- c("ENSG00000134588", "ENSG00000121101", "ENSG00000196074", "ENSG00000066923", "ENSG00000008710", "ENSG00000159374", "ENSG00000161594", "ENSG00000137090", "ENSG00000079335")
genes <- genes[-which(genes %in% moderate_genes)]
genes <- c(genes, "USP9Y", "DDX3Y", "PRY")

Get TPM data from GTEx testis samples

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/testis_GTEx_samples.R")
testis_GTEx_tpm <- testis_GTEx_samples(GTEx_file = GTEx_file, GTEx_samples = GTEx_samples)
head(testis_GTEx_tpm)

Read in the APPRIS isoform data

See APPRIS for column details… SANITY CHECK: This steps check that each gene of interest has an APPRIS isoform tag. If not, it will obtain the gene_id from biomaRt using the gene_name. No transcript_ids or CCDS_id will be obtained and the ‘status’ column will be flagged with a “minor” APPRIS designation.

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/appris_manipulation.R")
appris_df <- appris_manipulation(appris_file = appris_file, genes = genes) 
## The following genes do not have any APPRIS isoform information:
head(appris_df)

Get the GTEx testis TPM data of each APPRIS transcript

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/appris_manipulation.R")
appris_df_tpm <- get_appris_tpm(appris_df = appris_df, testis_GTEx_tpm = testis_GTEx_tpm)
appris_df_tpm <- data.table(do.call("rbind", appris_df_tpm)) %>% `colnames<-`(c("gene_name", "gene_id", "transcript_id", "CCDS", "status", "avg_GTEx_TPM"))

appris_df_tpm
## See which appris isoforms did not have GTEx values 
#genes[! genes %in% unique(appris_df_tpm$gene_name)]

Get GTEx testis TPM data for transcripts not in the APPRIS dataset

These are transcripts that are not identified as PRINCIPAL isoforms by APPRIS, but have avg TPM values across all testis samples > the minimum value of the APPRIS-defined PRINCIPAL isoforms.

NOTE: The GTEx-only transcript must have an avg TPM > 1 across all testis samples to be included in the dataset

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtex_transcript_tpm_manipulation.R")
appris_gtex_transcript_tpm <- gtex_transcript_tpm_manipulation(appris_df_tpm = appris_df_tpm, testis_GTEx_tpm = testis_GTEx_tpm, moderate_gene_ids = moderate_gene_ids)

# ## Run this command to filter out appris exons not reliably defined as principal exons
# appris_gtex_transcript_tpm <- appris_gtex_transcript_tpm[!(status %in% c("PRINCIPAL:4", "PRINCIPAL:5", "ALTERNATIVE:1", "ALTERNATIVE:2"))]
# file.remove("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")

appris_gtex_transcript_tpm

Use GTF file to get exon positions within each transcript (multiple transcripts belong to the same gene)

## This is the longest step, see if the data already exists 
if (file.exists("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")) {
  load("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")
}

if (!file.exists("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")) {
  ## Read in the gtf file 
  gtf <- fread(gtf_file, header=FALSE) %>% .[,c(1,4,5,9)] %>% `colnames<-` (c("chr", "start", "stop", "info")) 
  
  ## Get exon positions
  source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_manipulation.R")
  gtf_df <- gtf_manipulation(gtf = gtf, df = appris_gtex_transcript_tpm)
  head(gtf_df)
  save.image("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/gtf_df.Rdata")
  
  ## Write output to text file 
  write.table(x = gtf_df, file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/output/appris_gtex_gtf_testis.txt",
              quote = FALSE, sep = "\t", row.names = FALSE)  
}

Get the Pfam domain information for each exon

Schematic depicting how different exons across isoforms of a gene are merged.

## Output the warning messages of the pfam domain exon analysis 
if (file.exists("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam_log.txt")) {
  file.remove("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam_log.txt")
}
## [1] TRUE
## Capture output to a file
sink_file <- file("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam_log.txt", open = "wt")
sink(sink_file)
sink(sink_file, type = "message")

## Run the function
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/get_pfam.R")
pfam_df <- get_pfam(df = gtf_df)
## [1] "ENST00000263826" "AKT3"           
## [1] "ENST00000508376" "APC"            
## [1] "ENST00000257430" "APC"            
## [1] "ENST00000375320" "APOA1"          
## [1] "ENST00000375323" "APOA1"          
## [1] "ENST00000359492" "APOA1"          
## [1] "ENST00000236850" "APOA1"          
## [1] "ENST00000374690" "AR"             
## [1] "ENST00000452508" "ATM"            
## [1] "ENST00000278616" "ATM"            
## [1] "ENST00000288602" "BRAF"           
## [1] "ENST00000496384" "BRAF"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000419060'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000357654" "BRCA1"          
## [1] "ENST00000471181" "BRCA1"          
## [1] "ENST00000352993" "BRCA1"          
## [1] "ENST00000461574" "BRCA1"          
## [1] "gene: BRCA1 with transcript: ENST00000461574 do not have any pfam information"
## [1] "ENST00000544455" "BRCA2"          
## [1] "ENST00000380152" "BRCA2"          
## [1] "ENST00000634840" "CBL"            
## [1] "ENST00000264033" "CBL"            
## [1] "ENST00000634586" "CBL"            
## [1] "ENST00000498124" "CDKN2A"         
## [1] "gene: CDKN2A with transcript: ENST00000498124 do not have any pfam information"
## [1] "ENST00000304494" "CDKN2A"         
## [1] "gene: CDKN2A with transcript: ENST00000304494 do not have any pfam information"
## [1] "ENST00000479692" "CDKN2A"         
## [1] "gene: CDKN2A with transcript: ENST00000479692 do not have any pfam information"
## [1] "ENST00000497750" "CDKN2A"         
## [1] "gene: CDKN2A with transcript: ENST00000497750 do not have any pfam information"
## [1] "ENST00000578845" "CDKN2A"         
## [1] "gene: CDKN2A with transcript: ENST00000578845 do not have any pfam information"
## [1] "ENST00000579755" "CDKN2A"         
## [1] "ENST00000349496" "CTNNB1"         
## [1] "ENST00000450969" "CTNNB1"         
## [1] "gene: CTNNB1 with transcript: ENST00000450969 do not have any pfam information"
## [1] "ENST00000405570" "CTNNB1"         
## [1] "ENST00000433400" "CTNNB1"         
## [1] "gene: CTNNB1 with transcript: ENST00000433400 do not have any pfam information"
## [1] "ENST00000396183" "CTNNB1"         
## [1] "ENST00000396185" "CTNNB1"         
## [1] "ENST00000431914" "CTNNB1"         
## [1] "gene: CTNNB1 with transcript: ENST00000431914 do not have any pfam information"
## [1] "ENST00000441708" "CTNNB1"         
## [1] "gene: CTNNB1 with transcript: ENST00000441708 do not have any pfam information"
## [1] "ENST00000453024" "CTNNB1"         
## [1] "ENST00000336079" "DDX3Y"          
## [1] "ENST00000360160" "DDX3Y"          
## [1] "ENST00000527414" "DICER1"         
## [1] "ENST00000343455" "DICER1"         
## [1] "ENST00000393063" "DICER1"         
## [1] "ENST00000526495" "DICER1"         
## [1] "ENST00000532939" "DICER1"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000452556'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000556045" "DICER1"         
## [1] "ENST00000264709" "DNMT3A"         
## [1] "ENST00000321117" "DNMT3A"         
## [1] "ENST00000402667" "DNMT3A"         
## [1] "ENST00000380746" "DNMT3A"         
## [1] "ENST00000300853" "ERCC1"          
## [1] "ENST00000589165" "ERCC1"          
## [1] "ENST00000013807" "ERCC1"          
## [1] "ENST00000340192" "ERCC1"          
## [1] "ENST00000423698" "ERCC1"          
## [1] "ENST00000587888" "ERCC1"          
## [1] "gene: ERCC1 with transcript: ENST00000587888 do not have any pfam information"
## [1] "ENST00000206249" "ESR1"           
## [1] "ENST00000338799" "ESR1"           
## [1] "ENST00000440973" "ESR1"           
## [1] "ENST00000443427" "ESR1"           
## [1] "ENST00000267430" "FANCM"          
## [1] "ENST00000554809" "FANCM"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000450632'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000557110" "FANCM"          
## [1] "ENST00000351936" "FGFR2"          
## [1] "ENST00000346997" "FGFR2"          
## [1] "ENST00000358487" "FGFR2"          
## [1] "ENST00000457416" "FGFR2"          
## [1] "ENST00000369061" "FGFR2"          
## [1] "ENST00000613048" "FGFR2"          
## [1] "ENST00000613324" "FGFR2"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000481464'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000638709" "FGFR2"          
## [1] "gene: FGFR2 with transcript: ENST00000638709 do not have any pfam information"
## [1] "ENST00000412135" "FGFR3"          
## [1] "ENST00000440486" "FGFR3"          
## [1] "ENST00000260795" "FGFR3"          
## [1] "ENST00000352904" "FGFR3"          
## [1] "ENST00000481110" "FGFR3"          
## [1] "ENST00000366815" "H3-3A"          
## [1] "ENST00000366816" "H3-3A"          
## [1] "ENST00000366813" "H3-3A"          
## [1] "ENST00000366814" "H3-3A"          
## [1] "ENST00000589599" "H3-3B"          
## [1] "ENST00000587560" "H3-3B"          
## [1] "ENST00000586607" "H3-3B"          
## [1] "ENST00000254810" "H3-3B"          
## [1] "ENST00000586270" "H3-3B"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000465403'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000591890" "H3-3B"          
## [1] "ENST00000592643" "H3-3B"          
## [1] "ENST00000451590" "HRAS"           
## [1] "ENST00000311189" "HRAS"           
## [1] "ENST00000397596" "HRAS"           
## [1] "ENST00000397594" "HRAS"           
## [1] "ENST00000412167" "KIT"            
## [1] "ENST00000288135" "KIT"            
## [1] "ENST00000311936" "KRAS"           
## [1] "ENST00000256078" "KRAS"           
## [1] "ENST00000611712" "LIG4"           
## [1] "ENST00000405925" "LIG4"           
## [1] "ENST00000442234" "LIG4"           
## [1] "ENST00000307102" "MAP2K1"         
## [1] "ENST00000566326" "MAP2K1"         
## [1] "ENST00000262948" "MAP2K2"         
## [1] "ENST00000394867" "MAP2K2"         
## [1] "ENST00000401548" "MEI1"           
## [1] "gene: MEI1 with transcript: ENST00000401548 do not have any pfam information"
## [1] "ENST00000423900" "MEI1"           
## [1] "gene: MEI1 with transcript: ENST00000423900 do not have any pfam information"
## [1] "ENST00000231790" "MLH1"           
## [1] "ENST00000355774" "MLH3"           
## [1] "ENST00000380968" "MLH3"           
## [1] "ENST00000553713" "MLH3"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000451130'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000554697" "MLH3"           
## [1] "gene: MLH3 with transcript: ENST00000554697 do not have any pfam information"
## [1] "ENST00000375703" "MSH5"           
## [1] "ENST00000375755" "MSH5"           
## [1] "ENST00000375750" "MSH5"           
## [1] "ENST00000425703" "MSH5"           
## [1] "gene: MSH5 with transcript: ENST00000425703 do not have any pfam information"
## [1] "ENST00000429846" "MSH5"           
## [1] "ENST00000463144" "MSH5"           
## [1] "gene: MSH5 with transcript: ENST00000463144 do not have any pfam information"
## [1] "ENST00000358273" "NF1"            
## [1] "ENST00000356175" "NF1"            
## [1] "ENST00000471572" "NF1"            
## [1] "gene: NF1 with transcript: ENST00000471572 do not have any pfam information"
## [1] "ENST00000373588" "NR5A1"          
## [1] "ENST00000265849" "PMS2"           
## [1] "ENST00000305921" "PPM1D"          
## [1] "ENST00000392995" "PPM1D"          
## [1] "ENST00000308677" "PRKACA"         
## [1] "ENST00000303728" "PRY"            
## [1] "gene: PRY with transcript: ENST00000303728 do not have any pfam information"
## [1] "ENST00000429896" "PTCH1"          
## [1] "ENST00000437951" "PTCH1"          
## [1] "ENST00000421141" "PTCH1"          
## [1] "ENST00000375274" "PTCH1"          
## [1] "ENST00000430669" "PTCH1"          
## [1] "ENST00000331920" "PTCH1"          
## [1] "ENST00000418258" "PTCH1"          
## [1] "ENST00000375290" "PTCH1"          
## [1] "ENST00000551630" "PTCH1"          
## [1] "gene: PTCH1 with transcript: ENST00000551630 do not have any pfam information"
## [1] "ENST00000635625" "PTPN11"         
## [1] "ENST00000351677" "PTPN11"         
## [1] "ENST00000251849" "RAF1"           
## [1] "ENST00000442415" "RAF1"           
## [1] "ENST00000299440" "RAG1"           
## [1] "ENST00000267163" "RB1"            
## [1] "ENST00000355710" "RET"            
## [1] "ENST00000638465" "RET"            
## [1] "gene: RET with transcript: ENST00000638465 do not have any pfam information"
## [1] "ENST00000342988" "SMAD4"          
## [1] "ENST00000398417" "SMAD4"          
## [1] "ENST00000611848" "SMAD4"
## Warning: Could not find a CDS whith the expected length for protein:
## 'ENSP00000478613'. The returned genomic coordinates might thus not be correct
## for this protein.
## [1] "ENST00000395038" "SOS1"           
## [1] "ENST00000404395" "STAT3"          
## [1] "ENST00000588969" "STAT3"          
## [1] "ENST00000264657" "STAT3"          
## [1] "ENST00000326873" "STK11"          
## [1] "ENST00000586243" "STK11"          
## [1] "ENST00000392927" "SYCP3"          
## [1] "ENST00000266743" "SYCP3"          
## [1] "ENST00000392924" "SYCP3"          
## [1] "ENST00000374333" "TEX11"          
## [1] "ENST00000395889" "TEX11"          
## [1] "ENST00000344304" "TEX11"          
## [1] "ENST00000374320" "TEX11"          
## [1] "ENST00000638951" "TEX15"          
## [1] "gene: TEX15 with transcript: ENST00000638951 do not have any pfam information"
## [1] "ENST00000256246" "TEX15"          
## [1] "ENST00000298171" "TSHR"           
## [1] "ENST00000541158" "TSHR"           
## [1] "ENST00000338981" "USP9Y"          
## [1] "ENST00000453031" "USP9Y"          
## [1] "gene: USP9Y with transcript: ENST00000453031 do not have any pfam information"
## [1] "ENST00000256474" "VHL"            
## [1] "ENST00000345392" "VHL"            
## [1] "ENST00000332351" "WT1"            
## [1] "ENST00000639563" "WT1"            
## [1] "gene: WT1 with transcript: ENST00000639563 do not have any pfam information"
## [1] "ENST00000379077" "WT1"            
## [1] "ENST00000379079" "WT1"            
## [1] "ENST00000530998" "WT1"            
## [1] "ENST00000375128" "XPA"            
## [1] "ENST00000262887" "XRCC1"          
## [1] "ENST00000543982" "XRCC1"
## Output the data
pfam_df
## Revert output back to the console
sink(type = "message")
sink()

Merge overlapping exons across transcripts of the same gene

Schematic depicting how different exons across isoforms of a gene are merged.

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/merge_exons.R")
merged_exons <- merge_exons(df = pfam_df)

## Get the total panel space (exonic)
total <- sum(merged_exons$exon_size)

## Calculate proportion of panel space taken up by each gene 
merged_exons$proportion_total <- merged_exons$total_nucleotides/total*100
#head(merged_exons)

## Remove exons found below threshold of total proportion of transcripts for a given gene:
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/transcript_proportion_filter.R")
merged_exons <- transcript_proportion_filter(merged_exons = merged_exons)

## Write the output file 
write.table(x = merged_exons, file = paste0("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/output/merged_exons_.txt"), 
            quote = FALSE, sep = "\t", row.names = FALSE)

Output of master data

Summary

summary(merged_exons$exon_size)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     3.0    92.0   125.0   181.3   164.0  7313.0

All exons

## Factor the data so that it plots the x-axis in an order
merged_exons$x_axis <- paste0(merged_exons$gene_name, " (n = " , merged_exons$total_exons, " exons; avg = ", merged_exons$avg_exon_size, ")")
merged_exons$x_axis <- factor(merged_exons$x_axis, levels = unique(merged_exons$x_axis[order(merged_exons$total_nucleotides, decreasing = TRUE)]))

## Generate scatter plot --> Exons (y) per gene (x)
p1 <- ggplot(data = merged_exons) + geom_point(aes(x = x_axis, y = exon_size, color = transcript_num)) + 
  theme(axis.ticks.x = element_blank(), axis.text.x = element_text(angle = 75, hjust = 1)) + 
  labs(x = "Gene", y = paste0("Exon Size"), color="# of Transcripts") + scale_y_log10() + 
  scale_color_gradient(low = "grey", high = "red")

## Show proportion of panel space taken up by each gene 
p2 <- ggplot(data = merged_exons) + geom_line(aes(x = x_axis, y = proportion_total, group = 1), color = "Blue") + 
  theme(axis.ticks.x = element_blank(), axis.text.x = element_blank(), axis.title.x = element_blank()) + 
  labs(y = paste0("Proportion of", "\n", "Total (%)", "\n")) + 
  annotate(geom = "text", x = 40, y = 5, label = paste0("Total = ", sum(merged_exons$exon_size), " nucleotides")) + 
  geom_vline(xintercept = 9.5, linetype = 2, color = "red") 

ggarrange(p2, p1, nrow=2, ncol=1, common.legend = TRUE, legend = "right", heights = c(0.5, 2))

Prioritization of exons in the gene list of interest:

Necessary files

Cancer Hotspot Datasets

## Download [3D Hotspots](http://www.3dhotspots.org/#/home) dataset and manually save as a txt file
wget http://www.3dhotspots.org/files/3d_hotspots.xls

## Download  [Cancer Hotspots](https://www.cancerhotspots.org/#/home) MAF file
cd ~/spermseq/data/hotspots
wget http://download.cbioportal.org/cancerhotspots/cancerhotspots.v2.maf.gz 
zless cancerhotspots.v2.maf.gz | grep -v "#" | grep -w "protein_coding" > cancerhotspots.v2.maf
zless cancerhotspots.v2.maf.gz | grep -v "#" | head -n 1 > cancerhotspots.v2.maf.header
cat cancerhotspots.v2.maf.header cancerhotspots.v2.maf > cancerhotspots.v2.proteincoding.maf
cat cancerhotspots.v2.proteincoding.maf | cut -f 1,2,5,6,7,38 > cancerhotspots.v2.proteincoding.columns.maf
## Copy to local directory
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/hotspots/cancerhotspots.v2.proteincoding.columns.maf ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data

## Download the liftover file to convert hg19 coordinates from MAF file to hg38
wget https://hgdownload.cse.ucsc.edu/goldenpath/hg19/liftOver/hg19ToHg38.over.chain.gz 
gunzip hg19ToHg38.over.chain.gz > hg19ToHg38.over.chain
scp u1240855@kingspeak.chpc.utah.edu:~/spermseq/data/hotspots/hg19ToHg38.over.chain ~/Google\ Drive/Quinlan\ Lab\ -\ PhD/Projects/spermseq/twinstrand_target_gene_set/data

Set up R environment

Identify exons with 1+ cancer 3D hotspot mutation

## Exons with 3d hotspot mutations 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/format_hotspot.R")
merged_exons_3d_hotspot <- format_3D_hotspot(hotspot_file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/3d_hotspots_gao.txt",
                                             genes = genes, merged_exons = merged_exons)

Identify exons with 1+ cancer hotspot mutation

## Exons with snp/onp/ins/del hotspot mutations
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/format_hotspot.R")
hotspot_file <- "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/data/cancerhotspots.v2.proteincoding.columns.maf"
merged_exons_hotspot <- format_hotspot(hotspot_file = hotspot_file, genes = genes, merged_exons_3d_hotspot = merged_exons_3d_hotspot)
## [1] "CDKN2A"
## [1] "multiple_transcripts"
merged_exons_hotspot

Calcualte GC content of exons from merged exon list

## Load in necessary packages
library(BSgenome.Hsapiens.UCSC.hg38)
library(BSgenome)
library(GenomicRanges)

## Define GetGC function
GetGC <- function(bsgenome, gr){
  
  seqs <- BSgenome::getSeq(bsgenome, gr)
  return(as.numeric(Biostrings::letterFrequency(x = seqs, letters = "GC", as.prob = TRUE)))
  
}

## Get the IRanges
ranges <- IRanges(start = merged_exons_hotspot$start, end = merged_exons_hotspot$stop)

## Get GRanges object
gr <- GRanges(seqnames = paste0("chr", merged_exons_hotspot$chr), ranges=ranges)

## Get the gc content of each sequence 
merged_exons_hotspot$gc_content <- GetGC(bsgenome = BSgenome.Hsapiens.UCSC.hg38, gr = gr)
write.table(x = merged_exons_hotspot, file = "~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/twinstrand_target_gene_set/output/merged_exons_final.txt",
            sep = "\t", quote = FALSE)

## Save the data
save.image("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/final.Rdata")

Plots with different filters

Identify exons overlapping functional domains and/or harbor cancer hotspot mutations. Identify exons that are found in a certain proportion of transcripts for a given gene.

Exons in functional domains and harbor hotspots

## Filter out exons/rows that have NA values in pfam_id column
merged_exons_pfam <- na.omit(merged_exons_hotspot)
merged_exons_pfam_hotspot <- merged_exons_pfam[hotspot_3d_exon_num > 0 | hotspot_exon_num > 0]

## Calculate proportion of panel space taken up by each gene 
total <- sum(merged_exons_pfam_hotspot$exon_size)
merged_exons_pfam_hotspot <- rbindlist(lapply(unique(merged_exons_pfam_hotspot$gene_id), function(x, merged_exons_pfam_hotspot) {
  temp <- merged_exons_pfam_hotspot[gene_id == x]
  temp$total_nucleotides <- sum(temp$exon_size)
  temp$total_exons <- nrow(temp)
  temp$transcript_num <- (temp$transcript_id %>% strsplit(., split = " | "))[[1]] %>% unique() %>% length()
  temp$avg_exon_size <- round(mean(temp$exon_size), digits = 2)
  return(temp)
}, merged_exons_pfam_hotspot=merged_exons_pfam_hotspot))
merged_exons_pfam_hotspot$proportion_total <- merged_exons_pfam_hotspot$total_nucleotides/total*100

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = merged_exons_pfam_hotspot, genes = genes)
p

Exons in functional domains

## Filter out exons/rows that have NA values in pfam_id column
merged_exons_pfam <- na.omit(merged_exons_hotspot)

## Calculate proportion of panel space taken up by each gene 
total <- sum(merged_exons_pfam$exon_size)
merged_exons_pfam <- rbindlist(lapply(unique(merged_exons_pfam$gene_id), function(x, merged_exons_pfam) {
  temp <- merged_exons_pfam[gene_id == x]
  temp$total_nucleotides <- sum(temp$exon_size)
  temp$total_exons <- nrow(temp)
  temp$transcript_num <- (temp$transcript_id %>% strsplit(., split = " | "))[[1]] %>% unique() %>% length()
  temp$avg_exon_size <- round(mean(temp$exon_size), digits = 2)
  return(temp)
}, merged_exons_pfam=merged_exons_pfam))
merged_exons_pfam$proportion_total <- merged_exons_pfam$total_nucleotides/total*100

source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = merged_exons_pfam, genes = genes)
p

Transcript Number Exon Filter for Pfam Exons: 20

## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.2]

## Generate plot 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p

Transcript Number Exon Filter for Pfam Exons: 40

## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.4]

## Generate plot 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p

Transcript Number Filter for Pfam Exons: 50

## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.5]

## Generate plot 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p

Transcript Number Filter for Pfam Exons: 60

## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.6]

## Generate plot 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p

Transcript Number Exon Filter for Pfam Exons: 80

## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion >= 0.8]

## Generate plot 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p

Transcript Number Exon Filter for Pfam Exons: 100

## Subset the final dataset
final <- merged_exons_pfam[transcript_proportion == 1]

## Generate plot 
source("~/Google Drive/Quinlan Lab - PhD/Projects/spermseq/script/plot_df.R")
p <- plot_df(df = final, genes = genes)
p

Archive